This report analyzes the provided data on Airbnb bookings and property listings. The analysis addresses the following questions:
All the data was provided by Nova School of Business and Economics' research center: Data Science for Social Good (DSSG).
This analysis is focused on Tourism Management in Portugal and was developed for Turismo de Portugal, a public national institution responsible for promoting, enriching and sustaining Tourism in the country.
The report is structured as follows:
WILL BE CHANGED A LOT
Let's start by importing some necessary libraries and specify tokens and configurations:
import pandas as pd
import numpy as np
import datetime
from numba import jit
import sklearn.cluster as sk_cluster
# visualizations
import plotly.graph_objs as go
import plotly
import matplotlib
import matplotlib.pyplot as plt
# Render plotly figures inline in the notebook (offline mode).
plotly.offline.init_notebook_mode()
# Silence SettingWithCopyWarning: the analysis below deliberately assigns
# into column subsets of the loaded tables.
pd.options.mode.chained_assignment = None # default = 'warn'
# NOTE(review): the Mapbox access token is hard-coded here; consider moving
# it to an environment variable before sharing this notebook.
mapbox_token = 'pk.eyJ1Ijoiam9hb2ZvbnNlY2EiLCJhIjoiY2picXB3cDVvMDczYjJ3bzBxNDV3dGI0MSJ9.XpQDNjTuMAM-xckGln0KrA'
# Output directories: preprocessed data and generated html visualizations.
preprocess_dir = '_data_out/'
viz_out = '_viz_out/'
Next, we will check what kind of data we have in each table:
# Load the four raw extracts (all scraped on 2018-02-02).
properties = pd.read_csv('data/Portugal_Property_2018-02-02.csv')
daily = pd.read_csv('data/Portugal_Daily_2018-02-02.csv')
monthly = pd.read_csv('data/Portugal_Monthly_2018-02-02.csv')
reviews = pd.read_csv('data/Portugal_Review_2018-02-02.csv')
# Keep the tables in a dict so later steps can iterate over them by name.
tables = {'properties': properties,
          'daily': daily,
          'monthly': monthly,
          'reviews': reviews}
# List the available columns of every table.
for name, frame in tables.items():
    print(('Columns in table %s:\n' % name), list(frame), '\n')
We'll start by analysing the Properties table. It contains thorough information regarding Property listings in the Airbnb platform, from which we will use only a subset of this data:
# Keep only the listing attributes needed for this analysis: identity, type,
# creation date, location and the main economic indicators.
props_df = properties[['Property ID', 'Property Type', 'Listing Type', 'Created Date', 'City',
'Annual Revenue LTM (USD)', 'Average Daily Rate (Native)', 'Average Daily Rate (USD)',
'Number of Bookings LTM', 'Max Guests', 'Latitude', 'Longitude']]
Let's assess how the listings are distributed across Portugal:
# Scatter layer: one marker per listing; hover text shows annual revenue.
# NOTE(review): go.Data and go.Marker are deprecated aliases in newer plotly
# versions (plain lists / go.scattermapbox.Marker replace them) — kept as-is
# for compatibility with the plotly version used here.
data = go.Data([
go.Scattermapbox(
lat = properties['Latitude'],
lon = properties['Longitude'],
mode='markers',
marker=go.Marker(size= 2),
text = properties['Annual Revenue LTM (USD)']
)
])
# Map canvas centered roughly on mainland Portugal.
layout = go.Layout(
title='Location of listings',
autosize=True,
hovermode='closest',
mapbox=dict(
accesstoken=mapbox_token,
bearing=0,
style='light',
center=dict(
lat=39.64,
lon=-7.95,
),
pitch=0,
zoom=4.2
),
)
wonder_map = go.Figure(data=data, layout=layout)
#generate interactive visualization (written to disk, not opened automatically)
plotly.offline.plot(wonder_map, filename=viz_out+'geographic_analysis_plot.html', show_link=False, auto_open=False)
#div_file = plotly.offline.plot(wonder_map, include_plotlyjs=False, output_type='div')
#with open('divs/geographic_analysis_plot.html', 'w') as file:
# file.write(div_file)
#show plot in notebook:
plotly.offline.iplot(wonder_map, show_link=False)
Note: Due to the size of the dataset, geographic visualizations may have slower processing performance. To visualize the plot, open the specified html file.
This plot clarifies that listings are extremely concentrated in 3 main regions: Greater Lisbon, Greater Porto and Algarve region. Additionally, there is a significant concentration of listings in the centers of district capitals, as well as coastal regions.
We will now format the remaining columns to their proper formats and check for extreme values or outliers in the remaining data:
props_df['Created Date'] = pd.to_datetime(props_df['Created Date'])
# Convert the datetimes to int64 nanosecond timestamps so they can be binned
# by plt.hist / boxplotted like any other interval variable.
props_df['created_date_formatted'] = pd.DatetimeIndex(props_df['Created Date']).astype(np.int64)
print('Number of observations in dataset: ', len(props_df['Property ID']))
# --- interval variables: histogram + box plot + IQR outlier count ---
# Columns with non-interval data types are skipped.
non_interval = ['Property ID', 'Property Type', 'Listing Type', 'City',
                'Created Date', 'Latitude', 'Longitude']
for column in props_df:
    if column in non_interval:
        continue
    # histogram (left) and box plot (right), side by side
    plt.subplot(1, 2, 1)
    plt.hist(x=props_df[column].dropna(), bins=100)
    plt.subplot(1, 2, 2)
    plt.boxplot(x=props_df[column].dropna())
    plt.suptitle(column)
    plt.subplots_adjust(wspace=0.4)
    plt.show()
    # Tukey's fences: values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] count as outliers
    Q1 = props_df[column].dropna().quantile(0.25)
    Q3 = props_df[column].dropna().quantile(0.75)
    IQR = Q3 - Q1
    _filtered = props_df[Q1 - 1.5 * IQR < props_df[column]]
    filtered = _filtered[_filtered[column] < Q3 + 1.5 * IQR]
    if column == 'created_date_formatted':
        # report the human-readable dates rather than the int64 timestamps
        print('Minimum value: ', min(props_df['Created Date']), ' Maximum Value: ', max(props_df['Created Date']))
    else:
        print('Minimum value: ', min(props_df[column]), ' Maximum Value: ', max(props_df[column]))
    # Fix: dropna on both sides so missing values are not miscounted as
    # outliers (same procedure already used for the daily table).
    print('Count of observations outside of Inter Quartile Range: ', len(props_df[column].dropna()) - len(filtered[column].dropna()))
# --- class variables: frequency bar charts ---
for column in ['Property Type', 'Listing Type', 'City']:
    props_df[column].value_counts().plot(kind='bar', figsize=(22, 6), title=column)
    plt.show()
(data analysis, refer to outliers, extreme values, distribution, etc)
During Clustering: Decide cutoff levels for extreme values
The Daily table contains bookings made in the platform in Portugal on a daily basis. As assessed previously, this is the data available in the table:
['Property ID', 'Date', 'Status', 'Booked Date', 'Price (USD)', 'Price (Native)', 'Currency Native', 'Reservation ID']
# Select the columns we'll be using: booking identity, stay/booking dates and price.
day_df = daily[[ 'Property ID', 'Date', 'Booked Date', 'Price (USD)', 'Reservation ID' ]]
We will now format the remaining columns to their proper formats and check for extreme values or outliers in the remaining data:
day_df['Booked Date'] = pd.to_datetime(day_df['Booked Date'])
day_df['Date'] = pd.to_datetime(day_df['Date'])
# Converting the booking/stay dates to an integer (int64 ns timestamp) format to allow plotting
day_df['booked_date_format'] = pd.DatetimeIndex(day_df['Booked Date']).astype(np.int64)
day_df['date_format'] = pd.DatetimeIndex ( day_df['Date'] ).astype ( np.int64 )
print( 'Number of observations in dataset: ', len(day_df['Reservation ID']) )
# plotting interval variables
# Fix: the original decorated this function with numba's @jit "to accelerate
# the processing time", but numba cannot compile pandas/matplotlib calls —
# the decorator only forced an object-mode fallback (an error in recent
# numba versions) with no speedup, so it was removed.
def generate_viz_daily():
    """Plot a histogram and box plot, and report the IQR outlier count, for
    every interval column of the module-level daily bookings table ``day_df``."""
    for column in day_df.columns:
        # Excluding columns with non interval data types
        if column not in ['Property ID', 'Reservation ID', 'Date', 'Booked Date']:
            # plot histogram
            plt.subplot(1, 2, 1)
            plt.hist(x=day_df[column].dropna(), bins=100)
            # plot box plot
            plt.subplot(1, 2, 2)
            plt.boxplot(x=day_df[column].dropna())
            # Plot configurations
            plt.subplots_adjust(wspace=0.4)
            plt.suptitle(column)
            plt.show()
            # Computing IQR (Tukey's fences)
            Q1 = day_df[column].dropna().quantile(0.25)
            Q3 = day_df[column].dropna().quantile(0.75)
            IQR = Q3 - Q1
            # Filtering Values between Q1-1.5IQR and Q3+1.5IQR
            _filtered = day_df[Q1 - 1.5 * IQR < day_df[column]]
            filtered = _filtered[_filtered[column] < Q3 + 1.5 * IQR]
            # For the int64-converted date columns, report the readable datetimes.
            if column == 'booked_date_format':
                print('Minimum value: ', min(day_df['Booked Date'].dropna()), ' Maximum Value: ', max(day_df['Booked Date'].dropna()))
            elif column == 'date_format':
                print('Minimum value: ', min(day_df['Date']), ' Maximum Value: ', max(day_df['Date']))
            else:
                print('Minimum value: ', min(day_df[column]), ' Maximum Value: ', max(day_df[column]))
            print('Count of observations outside of Inter Quartile Range: ', len(day_df[column].dropna())-len(filtered[column].dropna()))
generate_viz_daily()
(data analysis, refer to outliers, extreme values, distribution, etc)
During Clustering: Decide cutoff levels for extreme values
The Monthly Bookings table contains bookings made in the platform in Portugal on a monthly basis, providing additional information through variables that were not accessible with the Daily Bookings table. As assessed previously, this is the data available in the table:
['Property ID', 'Host ID', 'Property Type', 'Listing Type', 'Bedrooms', 'Reporting Month', 'Occupancy Rate', 'Revenue (USD)', 'Revenue (Native)', 'ADR (USD)', 'ADR (Native)', 'Number of Reservations', 'Reservation Days', 'Available Days', 'Blocked Days', 'Country', 'State', 'City', 'Zip code', 'Neighborhood', 'Metropolitan Statistical Area', 'Latitude', 'Longitude', 'Active', 'Scraped During Month', 'Currency Native']
# Keep identity, the reporting month and the monthly performance metrics.
month_df = monthly[['Property ID', 'Host ID', 'Reporting Month', 'Occupancy Rate',
'Revenue (USD)', 'ADR (USD)', 'Number of Reservations',
'Reservation Days', 'Available Days', 'Blocked Days']]
It must be pointed that in this table the column 'Reporting Month' specifies year and month in string format, which we will need to format to Datetime.
Similar procedures to the ones previously applied (in order to assess how the data behaves) will be used:
# formatting Reporting Month to datetime:
month_df['Reporting Month'] = pd.to_datetime(month_df['Reporting Month'])
# Convert to int64 nanosecond timestamps so the month can be histogrammed.
month_df['reporting_month_formatted'] = pd.DatetimeIndex(month_df['Reporting Month']).astype(np.int64)
print('Number of observations in dataset: ', len(month_df['Property ID']))
# plotting interval variables: histogram + box plot + IQR outlier count
for column in month_df:
    # Excluding columns with non interval data types
    if column in ['Property ID', 'Host ID', 'Reporting Month']:
        continue
    plt.subplot(1, 2, 1)
    plt.hist(x=month_df[column].dropna(), bins=100)
    plt.subplot(1, 2, 2)
    plt.boxplot(x=month_df[column].dropna())
    plt.suptitle(column)
    plt.subplots_adjust(wspace=0.4)
    plt.show()
    # Tukey's fences: flag values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    Q1 = month_df[column].dropna().quantile(0.25)
    Q3 = month_df[column].dropna().quantile(0.75)
    IQR = Q3 - Q1
    _filtered = month_df[Q1 - 1.5 * IQR < month_df[column]]
    filtered = _filtered[_filtered[column] < Q3 + 1.5 * IQR]
    if column == 'reporting_month_formatted':
        # report the human-readable dates rather than the int64 timestamps
        print('Minimum value: ', min(month_df['Reporting Month']), ' Maximum Value: ', max(month_df['Reporting Month']))
    else:
        print('Minimum value: ', min(month_df[column]), ' Maximum Value: ', max(month_df[column]))
    # Fix: dropna on both sides so missing values are not miscounted as
    # outliers (same procedure already used for the daily table).
    print('Count of observations outside of Inter Quartile Range: ', len(month_df[column].dropna()) - len(filtered[column].dropna()))
(data analysis, refer to outliers, extreme values, distribution, etc)
During Clustering: Decide cutoff levels for extreme values
The Listing Reviews table contains reviews made by users in the platform in Portugal on a monthly basis, as well as some details regarding the user's profile, namely First name, Country of origin, State (if the user is from the USA), City of origin, brief user description, last attended teaching institution and occupation. Again, this is the data available in the table, from which we will select only the variables that are relevant for this analysis:
['Property ID', 'Latitude', 'Longitude', 'Address', 'Review Date', 'Review Text', 'User ID', 'Member Since', 'First Name', 'Country', 'State', 'City', 'Description', 'School', 'Work', 'Profile Image URL', 'Profile URL']
# Data regarding house/listing information will not be selected to avoid data replication;
# keep only review identity, dates and the reviewer's profile fields.
rev_df = reviews[[ 'Property ID', 'User ID', 'Review Date', 'Member Since', 'First Name',
'Country', 'State', 'City', 'Description', 'School', 'Work' ]]
A similar procedure will be applied (in order to assess how the data behaves):
# formatting Review Date and Member Since to datetime:
rev_df['Review Date'] = pd.to_datetime(rev_df['Review Date'])
rev_df['Member Since'] = pd.to_datetime(rev_df['Member Since'])
# int64 nanosecond timestamps, to allow plotting the dates as numbers
rev_df['review_date_formatted'] = pd.DatetimeIndex(rev_df['Review Date']).astype(np.int64)
rev_df['member_since_formatted'] = pd.DatetimeIndex(rev_df['Member Since']).astype(np.int64)
# Keep only rows with a positive timestamp (missing 'Member Since' values
# convert to a large negative int64, so this drops them).
rev_df = rev_df[rev_df['member_since_formatted'] > 0]
print('Number of observations in dataset: ', len(rev_df['Property ID']))
# plotting interval variables: histogram + box plot + IQR outlier count
for column in rev_df:
    # Excluding columns with non interval data types
    if column in ['Property ID', 'User ID', 'Review Date', 'Member Since', 'First Name',
                  'Country', 'State', 'City', 'Description', 'School', 'Work']:
        continue
    print(column)
    plt.subplot(1, 2, 1)
    plt.hist(x=rev_df[column].dropna(), bins=100)
    plt.subplot(1, 2, 2)
    plt.boxplot(x=rev_df[column].dropna())
    plt.suptitle(column)
    plt.subplots_adjust(wspace=0.4)
    plt.show()
    # Tukey's fences
    Q1 = rev_df[column].dropna().quantile(0.25)
    Q3 = rev_df[column].dropna().quantile(0.75)
    IQR = Q3 - Q1
    _filtered = rev_df[Q1 - 1.5 * IQR < rev_df[column]]
    filtered = _filtered[_filtered[column] < Q3 + 1.5 * IQR]
    if column == 'review_date_formatted':
        print('Minimum value: ', min(rev_df['Review Date']), ' Maximum Value: ', max(rev_df['Review Date']))
    elif column == 'member_since_formatted':
        print('Minimum value: ', min(rev_df['Member Since']), ' Maximum Value: ', max(rev_df['Member Since']))
    else:
        print('Minimum value: ', min(rev_df[column]), ' Maximum Value: ', max(rev_df[column]))
    # Fix: dropna on both sides so missing values are not miscounted as
    # outliers (same procedure already used for the daily table).
    print('Count of observations outside of Inter Quartile Range: ', len(rev_df[column].dropna()) - len(filtered[column].dropna()))
We realize a very important detail after a first analysis: Data regarding customer profile is not standardized. In order to save space, this first analysis will not be displayed, but rather a final analysis, after preprocessing. So, we will be required to standardize it by parsing each person's country of origin from the input text received:
# use script developed: reviews_data_standardized.py
# This script uses geograpy, a library that helps parsing locations from text.
# As we are using python 3.6.3 and geograpy was developed for python 2.7, we
# need the python-3 adaptation available at
# https://github.com/reach2ashish/geograpy, installable with:
# python3 -m pip install git+https://github.com/reach2ashish/geograpy.git
from reviews_data_standardized import preprocess_countries_reviews_table
preprocess_countries_reviews_table(reviews, preprocess_dir)
# Import output
reviews_pre = pd.read_csv(preprocess_dir+'Portugal_Review_Standardized.csv')
# Drop a known bad row where a date value leaked into the Longitude column.
reviews_pre = reviews_pre[reviews_pre['Longitude'] != '2013-07-01']
# Fix: both calls below return new objects that were previously discarded,
# so neither the deduplication nor the float conversion actually happened.
reviews_pre = reviews_pre.drop_duplicates()
reviews_pre['Longitude'] = reviews_pre['Longitude'].astype('float')
print(list(reviews_pre.columns))
# Select columns: raw profile fields plus the three parsed-country columns.
rev_df = reviews_pre[[ 'Property ID', 'User ID', 'Review Date', 'Member Since', 'First Name',
                       'Country', 'State', 'City', 'Description', 'School', 'Work',
                       'country_after_parse', 'country_from_city_parse', 'final_country_parse' ]]
Finally, this is the result of our preprocessing:
# Bar charts of the three parsed-country columns (top 50 each), skipping the
# '-' placeholder used for origins that could not be parsed.
for col in ('country_after_parse', 'country_from_city_parse', 'final_country_parse'):
    parsed = rev_df.loc[rev_df[col] != '-', col]
    parsed.value_counts().nlargest(50).plot(kind='bar', figsize=(22, 6), title=col)
    plt.show()
    print('Count of non-parsed origins: ', len(rev_df[rev_df[col] == '-'][col]))
Comparing the final parse with the initial top 50 origins distribution:
# plot bar chart of the raw (unparsed) 'Country' field, top 50, for comparison
reviews[reviews['Country'] != '-']['Country'].value_counts().nlargest(50).plot(kind='bar', figsize=(22, 6), title='Country')
#plt.set_title(column)
plt.show()
Finally, a sum up of all the analysis in all tables:
# Summary statistics plus missing-value percentage for every table, each
# rendered as a plotly table. zip pairs names with frames directly, replacing
# the manual index counter.
for name, table in zip(['properties', 'daily', 'monthly', 'reviews'],
                       [properties, daily, monthly, reviews_pre]):
    print(name)
    analysis = table.describe().T
    # percentage of missing values per column, formatted like '12.3%'
    analysis['na_count_percent'] = np.round(table.isnull().sum() / len(table) * 100, 1)
    analysis['na_count_percent'] = analysis['na_count_percent'].astype('str') + '%'
    # move the column names out of the index, under the header 'columns'
    analysis = analysis.reset_index()
    analysis = analysis.rename(columns={'index': 'columns'})
    trace = go.Table(
        header=dict(values=analysis.columns, fill=dict(color='#C2D4FF'), align=['center'] * 5),
        cells=dict(values=[analysis[column] for column in analysis.columns], fill=dict(color='#F5F8FF'),
                   align=['left'] * 5))
    plotly.offline.iplot([trace], show_link=False)
Aside from the excessively high revenue values in the Property Listings table, we did not detect any other values that we were certain were wrong. So, we conclude that our dataset does not contain values that must be removed for that reason. We did, however, find some outliers.
In order to maintain a simple, straightforward analysis, we will use the Interquartile Range method (IQR) to remove these extreme values:
1) IQR = Q3 – Q1
2) If a data point is below Q1 – 1.5×IQR or above Q3 + 1.5×IQR, it is viewed as being too far from the central values to be reasonable.
After this, we will have two options: Remove these observations, or set these values as missing and reapply the method used in the previous step.
# handle outliers here
# Working copies of the four tables; the loop below replaces entries in this
# dict with their filtered versions.
tables_filtered = dict(properties = properties,
daily = daily,
monthly = monthly,
reviews = reviews_pre)
# Filtering Values between Q1-1.5IQR and Q3+1.5IQR
for table in list(tables_filtered.keys()):
    if table != 'properties' and table != 'reviews':
        # daily / monthly: IQR-filter only the named columns
        #print(' ',table)
        for column in list(tables_filtered[table].columns):
            if column in ['Price (USD)', 'Available Days']:
                #print(column)
                Q1 = tables_filtered[table][column].dropna().quantile(0.25)
                Q3 = tables_filtered[table][column].dropna().quantile(0.75)
                IQR = Q3 - Q1
                # NOTE(review): rows where the column is NaN fail both
                # comparisons and are therefore dropped here as well.
                _filtered = tables_filtered[table][ Q1-1.5*IQR < tables_filtered[table][column]]
                filtered = _filtered[ _filtered[column] < Q3+1.5*IQR ]
                tables_filtered[table] = filtered
                #print(len(tables_filtered[table]))
    elif table == 'properties':
        # properties: only drop listings with implausibly high annual revenue
        #print(' ',table)
        tables_filtered[table] = tables_filtered[table][tables_filtered[table]['Annual Revenue LTM (USD)'] < 10000000]
    elif table == 'reviews':
        # reviews: only deduplicate
        tables_filtered[table] = tables_filtered[table].drop_duplicates()
#tbls = list(tables_filtered.keys())
print('Number of observations filtered:')
print('properties',': ', (len(tables['properties'])-len(tables_filtered['properties'])))
print('daily',': ', (len(tables['daily'])-len(tables_filtered['daily'])))
print('monthly',': ', (len(tables['monthly'])-len(tables_filtered['monthly'])))
# NOTE(review): this compares the raw 'reviews' table against the filtered
# standardized 'reviews_pre' — the count mixes two different sources; verify
# that this difference is the intended metric.
print('reviews',': ', (len(tables['reviews'])-len(tables_filtered['reviews'])))
In order to proceed to the next step of the analysis, we will need to handle missing values. As such, we will check for inexistent values in all variables from the 3 different tables, displayed in the previous set of tables.
# Per-column missing-value counts for each of the four raw tables.
nan_props = properties.isnull().sum()
nan_daily = daily.isnull().sum()
nan_monthly = monthly.isnull().sum()
nan_reviews = reviews.isnull().sum()
# NaN counts and the matching table sizes share the same keys so the loop
# below can compute relative percentages.
nans = {'nan_props': nan_props,
        'nan_daily': nan_daily,
        'nan_monthly': nan_monthly,
        'nan_reviews': nan_reviews}
lengths = {'nan_props': len(properties),
           'nan_daily': len(daily),
           'nan_monthly': len(monthly),
           'nan_reviews': len(reviews)}
nan_tables = []
for key, counts in nans.items():
    summary = pd.DataFrame(counts, columns=['nan_count']).reset_index()
    summary.columns = ['columns', 'nan_count']
    # percentage over the total size of the data set for relative analysis
    summary['nan%'] = summary['nan_count'].apply(lambda x: (x / lengths[key]) * 100)
    nan_tables.append(summary)
# One wide table: columns, counts and percentages of all four tables side by side.
final_nan = pd.concat(nan_tables, ignore_index=True, axis=1)
final_nan.columns = ['Properties', 'nan_p', 'nan%_p', 'Daily', 'nan_d', 'nan%_d',
                     'Monthly', 'nan_m', 'nan%_m', 'Reviews', 'nan_r', 'nan%_r']
# The tables have different column counts, so pad the shorter ones with '-'.
final_nan = final_nan.fillna(value='-')
# Render as a plotly table.
trace = go.Table(
    header=dict(values=final_nan.columns, fill=dict(color='#C2D4FF'), align=['left'] * 5),
    cells=dict(values=[final_nan[column] for column in final_nan.columns], fill=dict(color='#F5F8FF'),
               align=['left'] * 5))
plotly.offline.iplot([trace], show_link=False)
Here we see that some columns have a very high percentage of missing values; for each of these we must decide whether to use them at all and, if so, how to fill in the missing values. To do this, we must consider each variable's importance to our goal: tourism profiling and flows analysis.
Once again, given the size of the present datasets, we will use the median to fill the missing values in the necessary columns, i.e., the ones we will be using for the remainder of the analysis:
# Column-wise medians of the numeric variables. numeric_only=True makes the
# previously implicit exclusion of non-numeric columns explicit and keeps the
# call working on newer pandas versions.
medians_props = tables_filtered['properties'].median(skipna=True, numeric_only=True)
medians_monthly = tables_filtered['monthly'].median(skipna=True, numeric_only=True)
# Median imputation values, restricted to the columns used in the remainder
# of the analysis.
na_properties = { 'Average Daily Rate (Native)': medians_props['Average Daily Rate (Native)'],
                  'Average Daily Rate (USD)': medians_props['Average Daily Rate (USD)'],
                  'Occupancy Rate LTM': medians_props['Occupancy Rate LTM'],
                  'Number of Bookings LTM': medians_props['Number of Bookings LTM'],
                  'Number of Reviews': medians_props['Number of Reviews'],
                  'Max Guests': medians_props['Max Guests'],
                  'Published Monthly Rate (USD)': medians_props['Published Monthly Rate (USD)'] }
na_monthly = { 'Revenue (Native)': medians_monthly['Revenue (Native)'],
               'Occupancy Rate': medians_monthly['Occupancy Rate'],
               'Bedrooms': medians_monthly['Bedrooms']}
tables_filtered['properties'] = tables_filtered['properties'].fillna(value=na_properties)
tables_filtered['monthly'] = tables_filtered['monthly'].fillna(value=na_monthly)
"the stats are inaccurate. The number of blocked days is wrong. The number of booked dates is wrong. The number of unbooked days is wrong. Clearly, AirBNB is not doing their own calculations correctly. Thankfully, the SuperHost numbers are correct."
It is impossible to assess whether the number of reviews is representative, given that Airbnb's accuracy in their data is not certain. Furthermore, Airbnb's reviews can be either public or private. As we are only using publicly available data, we do not have access to user data that left a private review, or no review at all. So, we are analysing user profiles that represent 10% of the total bookings that were actually completed. Although the user data sample extracted from the overall reviews was not randomly generated, it is highly representative. The number of total completed bookings made between September 1st 2014 and December 31st 2017 is 11.550 million Bookings which implies a minimum sample size of 16564 for a 99% confidence level and 1% margin of error. As our sample has a size of 1.2 million observations, it is statistically significant (although, we cannot conclude that it is an unbiased sample, as it was not randomly selected).
# Columns excluded from the correlation matrix: identifiers and free-text /
# categorical location fields with no numeric meaning.
_excluded_columns = ['Zipcode', 'Metropolitan Statistical Area', 'Property ID', 'Host ID',
                     'Last Scraped Date', 'Country', 'State', 'City', 'Neighborhood']
# Fix: build the list with a comprehension instead of nine chained
# list.remove() calls, which raised ValueError whenever a column was absent.
properties_column_list = [column for column in tables_filtered['properties'].columns
                          if column not in _excluded_columns]
corr_matrix = tables_filtered['properties'][properties_column_list].corr(method='pearson')
# Heatmap of the pairwise Pearson correlations.
trace = go.Heatmap(z=corr_matrix.values.tolist(),
                   x=properties_column_list,
                   y=properties_column_list,
                   )
data = go.Data([trace])
layout = dict(margin=dict(t=50, r=50, b=150, l=150))
figure = dict(data=data, layout=layout)
plotly.offline.plot(figure, filename=viz_out+'correlation_matrix_test.html', show_link=True, auto_open=True)
plotly.offline.iplot(figure, show_link=False)
(Insights taken from the correlation matrix goes here, and there are plenty)
# select variables
geo = tables_filtered['properties'][[ 'Property ID', 'Latitude', 'Longitude' ]]
# Keep only points with Longitude > -12 (drops far-western coordinates,
# e.g. the Azores — NOTE(review): confirm this exclusion is intended).
geo = geo[geo['Longitude'] > -12]
# Define K-means algorithm.
# Fix: a fixed random_state makes the cluster labels reproducible across
# runs — the 'cluster' column is merged into other tables further below, so
# non-deterministic labels would change the rest of the analysis run to run.
km = sk_cluster.KMeans(n_clusters=10, random_state=42)
km.fit(geo[['Latitude', 'Longitude']])
# Get Cluster assignment Labels
labels = km.labels_
geo['cluster'] = labels
# Map of listings, colored by geographic cluster.
data = go.Data([
    go.Scattermapbox(
        lat=geo['Latitude'],
        lon=geo['Longitude'],
        mode='markers',
        marker=go.Marker(size=2,
                         color=geo['cluster']),
        text=geo['cluster']
    )
])
layout = go.Layout(
    title='Location of listings',
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_token,
        bearing=0,
        style='dark',
        center=dict(
            lat=39.64,
            lon=-7.95,
        ),
        pitch=0,
        zoom=4.2
    ),
)
wonder_map = go.Figure(data=data, layout=layout)
#generate interactive visualization (saved to html; also shown inline below)
plotly.offline.plot(wonder_map, filename=viz_out+'geographic_clustering.html', show_link=True, auto_open=False)
plotly.offline.iplot(wonder_map, show_link=False)
In this clustering analysis we will cluster listings by economic and property features, as well as details in Airbnb platform such as the number of pictures in a listing.
# select variables
val = tables_filtered['properties'][[ 'Property ID', 'Latitude', 'Longitude', 'Annual Revenue LTM (USD)', 'Average Daily Rate (USD)',
                                      'Occupancy Rate LTM', 'Number of Bookings LTM',  # 'Number of Reviews' <- highly correlated with number of bookings
                                      'Bedrooms', 'Bathrooms', 'Max Guests' ]]
# Impute remaining NaNs with each column's mean so KMeans receives no NaNs.
val = val.fillna(val.mean())
#val = val[val['Longitude'] > -12]
# Define K-means algorithm.
# Fix: a fixed random_state makes the cluster labels reproducible across
# runs, so the value-cluster maps and any downstream use stay stable.
km = sk_cluster.KMeans(n_clusters=10, random_state=42)
km.fit(val[['Annual Revenue LTM (USD)', 'Average Daily Rate (USD)', 'Occupancy Rate LTM', 'Number of Bookings LTM'
            # , 'Number of Reviews'
            ]])
# Get Cluster assignment Labels
labels = km.labels_
val['val_cluster'] = labels
# Map of listings, colored by economic-value cluster.
data = go.Data([
    go.Scattermapbox(
        lat=val['Latitude'],
        lon=val['Longitude'],
        mode='markers',
        marker=go.Marker(size=2,
                         color=val['val_cluster']),
        text=val['val_cluster']
    )
])
layout = go.Layout(
    title='Location of listings',
    autosize=True,
    hovermode='closest',
    mapbox=dict(
        accesstoken=mapbox_token,
        bearing=0,
        style='dark',
        center=dict(
            lat=39.64,
            lon=-7.95,
        ),
        pitch=0,
        zoom=4.2
    ),
)
super_map = go.Figure(data=data, layout=layout)
#generate interactive visualization (saved to html; also shown inline below)
plotly.offline.plot(super_map, filename=viz_out+'value_clustering.html', show_link=True, auto_open=False)
plotly.offline.iplot(super_map, show_link=False)
We will need to merge and adapt our data (group it by value cluster, region, month, etc) in order to analyse it further:
# Select the tables we will use
rev = tables_filtered['reviews'][['User ID', 'Property ID', 'final_country_parse', 'Review Date']]
# Attach each property's geographic cluster label to its listing details.
prop = pd.merge(tables_filtered['properties'][[ 'Property ID', 'Average Daily Rate (USD)', 'Listing Type', 'Bedrooms' ]], geo[['Property ID', 'cluster']], on='Property ID', how='left')
# Daily rows with Status == 'R' only (presumably reserved/booked days —
# TODO confirm against the data dictionary).
day = tables_filtered['daily'][tables_filtered['daily']['Status'] == 'R'][['Property ID', 'Booked Date', 'Date', 'Price (USD)']]
mon = tables_filtered['monthly'][[ 'Property ID', 'Reporting Month', 'Occupancy Rate', 'ADR (USD)', 'Number of Reservations', 'Revenue (USD)' ]]
# Merge tables into 2: reviews+properties (tourists) and monthly+properties (locations)
tourists = pd.merge(rev, prop, how= 'left', on= 'Property ID')
locations = pd.merge(mon, prop, how = 'left', on= 'Property ID')
# Format columns
tourists['Review Date'] = pd.to_datetime(tourists['Review Date'])
locations['Reporting Month'] = pd.to_datetime(locations['Reporting Month'])
# Duplicate the reporting month under the name 'Review Date' so both merged
# tables can later be grouped by the same key name.
locations['Review Date'] = locations['Reporting Month']
tourists['Listing Type'] = tourists['Listing Type'].fillna('unknown')
# Properties without a geographic cluster get the sentinel value 100.
tourists['cluster'] = tourists['cluster'].fillna(100)
tourists['cluster'] = tourists['cluster'].astype('int')
# define class variables by 1 or 0's for each column to determine percentages afterwards
def get_cluster(value, cluster):
    """Return 1 when `value` equals `cluster`, else 0 (one-hot indicator)."""
    return 1 if value == cluster else 0
# One-hot encode the geo cluster (0-9 plus the 100 sentinel) into indicator
# columns, so a later groupby-mean yields the share per cluster directly.
cluster_nums = list(range(10)) + [100]
cluster_nums_columns = [f'cluster_{c}' for c in cluster_nums]
for c in cluster_nums:
    # Bind `c` as a default argument so each lambda captures its own value.
    tourists[f'cluster_{c}'] = tourists['cluster'].apply(lambda x, c=c: get_cluster(x, c))
# Same one-hot treatment for the listing type.
property_type_index = list(tourists['Listing Type'].unique())
property_type_columns = [f'listing_type_{lt}' for lt in property_type_index]
for lt in property_type_index:
    tourists[f'listing_type_{lt}'] = tourists['Listing Type'].apply(lambda x, lt=lt: get_cluster(x, lt))
# And for the 30 most frequent countries of origin, minus three entries that
# look like geocoding misparses — TODO confirm they are indeed bad parses.
origins_index = list(rev_df['final_country_parse'].value_counts().nlargest(30).keys())
for misparsed in ('French Southern Territories',
                  'Saint Helena, Ascension and Tristan da Cunha',
                  'Lao People\'s Democratic Republic'):
    origins_index.remove(misparsed)
for origin in origins_index:
    tourists[origin] = tourists['final_country_parse'].apply(lambda x, o=origin: get_cluster(x, o))
# Generate the final tables for visualization: averages (shares) plus review
# counts, aggregated once per country and once per geo cluster.
origins_grouped = tourists.drop(origins_index, axis=1).groupby(['final_country_parse', 'Review Date']).mean()
origins_grouped['review_count'] = tourists.groupby(['final_country_parse', 'Review Date']).size()
regions_grouped = tourists.drop(cluster_nums_columns, axis=1).groupby(['cluster', 'Review Date']).mean()
regions_grouped['review_count'] = tourists.groupby(['cluster', 'Review Date']).size()
regions_grouped = regions_grouped.reset_index()
# Monthly metrics per (cluster, month): occupancy is averaged while
# reservations and revenue are summed.
pre_merg = locations[['cluster', 'Review Date', 'Occupancy Rate', 'Number of Reservations', 'Revenue (USD)']]
monthly_groups = pre_merg.groupby(['cluster', 'Review Date'])
monthly_means = monthly_groups.mean()
monthly_sums = monthly_groups.sum()
summary = monthly_means[['Occupancy Rate']]
summary['Sum of Reservations'] = monthly_sums['Number of Reservations']
summary['projected_revenue'] = monthly_sums['Revenue (USD)']
summary['Avg Occupancy Rate'] = summary['Occupancy Rate']
final_merg = summary[['Sum of Reservations', 'Avg Occupancy Rate', 'projected_revenue']].reset_index()
regions_grouped = pd.merge(regions_grouped, final_merg, on=['cluster', 'Review Date'], how='left')
# Persist both tables for the external visualization pipeline.
regions_grouped.to_csv(preprocess_dir+'portuguese_regions_clusters.csv')
origins_grouped.to_csv(preprocess_dir+'countries_of_origin.csv')
# Time series: total reservations per geo cluster.
trace_list = []
for cluster in cluster_nums:
    subset = regions_grouped[regions_grouped['cluster'] == cluster]
    trace_list.append({
        "x": subset['Review Date'],
        "y": subset['Sum of Reservations'],
        "mode": "lines",
        "name": cluster,
        "text": f"Cluster {cluster}",
    })
data = go.Data(trace_list)
layout = {
    "hovermode": "closest",
    "title": "Number of reservations per cluster",
    "xaxis": {
        "autorange": False,
        # Fixed window covering the period with reliable data.
        "range": ["2014-08-31 15:42:21.1765", "2017-12-04 18:52:56.4706"],
        "title": "Review Date",
        "type": "date",
    },
    "yaxis": {
        "autorange": True,
        "title": "Sum of Reservations",
        "type": "linear",
    },
}
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename=viz_out+'reservation_growth_per_region.html', show_link=True, auto_open=False)
plotly.offline.iplot(fig, show_link=False)
# Time series: projected revenue per geo cluster.
trace_list = []
for cluster in cluster_nums:
    subset = regions_grouped[regions_grouped['cluster'] == cluster]
    trace_list.append({
        "x": subset['Review Date'],
        "y": subset['projected_revenue'],
        "mode": "lines",
        "name": cluster,
        "text": "Cluster " + str(cluster),
    })
data = go.Data(trace_list)
layout = {
    "hovermode": "closest",
    "title": "Revenue growth per cluster",
    "xaxis": {
        "autorange": False,
        # Fixed window covering the period with reliable data.
        "range": ["2014-08-31 15:42:21.1765", "2017-12-04 18:52:56.4706"],
        "title": "Review Date",
        "type": "date",
    },
    "yaxis": {
        "autorange": True,
        # FIX: the y-axis label was copy-pasted from the reservations chart
        # ("Sum of Reservations") although this chart plots revenue.
        "title": "Projected Revenue (USD)",
        "type": "linear",
    },
}
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename=viz_out+'revenue_growth_per_region.html', show_link=True, auto_open=False)
plotly.offline.iplot(fig, show_link=False)
# Time series: average revenue per reservation (revenue / reservations)
# for each geo cluster.
trace_list = []
for cluster in cluster_nums:
    subset = regions_grouped[regions_grouped['cluster'] == cluster]
    trace_list.append({
        "x": subset['Review Date'],
        "y": subset['projected_revenue'] / subset['Sum of Reservations'],
        "mode": "lines",
        "name": cluster,
        "text": "Cluster " + str(cluster),
    })
data = go.Data(trace_list)
layout = {
    "hovermode": "closest",
    "title": "Revenue per reservation",
    "xaxis": {
        "autorange": False,
        # Fixed window covering the period with reliable data.
        "range": ["2014-08-31 15:42:21.1765", "2017-12-04 18:52:56.4706"],
        "title": "Review Date",
        "type": "date",
    },
    "yaxis": {
        "autorange": True,
        # FIX: the y-axis label was copy-pasted from the reservations chart
        # ("Sum of Reservations") although this chart plots a revenue ratio.
        "title": "Revenue per Reservation (USD)",
        "type": "linear",
    },
}
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename=viz_out+'revenue_per_reservation_per_region.html', show_link=True, auto_open=False)
plotly.offline.iplot(fig, show_link=False)
# One pie chart per geo cluster showing its top-15 visitor nationalities
# (remaining countries collapsed into an 'others' slice), laid out on a
# 3x3 grid with cluster 0 centred on the bottom row.
pies = []
domains = [
    {'x': [0.0, 0.33], 'y': [0.77, 1.00]}, #9
    {'x': [0.33, 0.66], 'y': [0.77, 1.00]}, #8
    {'x': [0.66, 0.99], 'y': [0.77, 1.00]}, #7
    {'x': [0.0, 0.33], 'y': [0.52, 0.75]}, #6
    {'x': [0.33, 0.66], 'y': [0.52, 0.75]}, #5
    {'x': [0.66, 0.99], 'y': [0.52, 0.75]}, #4
    {'x': [0.0, 0.33], 'y': [0.27, 0.50]}, #3
    {'x': [0.33, 0.66], 'y': [0.27, 0.50]}, #2
    {'x': [0.66, 0.99], 'y': [0.27, 0.50]}, #1
    {'x': [0.33, 0.66], 'y': [0.02, 0.25]}, #0
]
for cluster in range(0, 10):
    origins_per_cluster = tourists[['cluster', 'Review Date', 'final_country_parse']]
    in_cluster = origins_per_cluster['cluster'] == cluster
    has_country = origins_per_cluster['final_country_parse'] != '-'
    # FIX: combine the masks with & instead of chained indexing
    # (df[mask1][mask2]); the second mask was built against the unfiltered
    # frame, so chaining relied on index realignment and raised a
    # "Boolean Series will be reindexed" warning.
    vals = (origins_per_cluster[in_cluster & has_country][['final_country_parse', 'cluster']]
            .groupby('final_country_parse').count())
    pie_data = vals.nlargest(15, 'cluster')
    # Everything outside the top 15 is summarised as 'others'.
    other_count = vals['cluster'].sum() - pie_data['cluster'].sum()
    pie_data.loc['others'] = [other_count]
    labels = list(pie_data.index)
    values = list(pie_data['cluster'])
    name = 'Region %s' % str(cluster)
    pies.append(go.Pie(values=values, labels=labels, name=name, domain=domains[cluster]))
layout = go.Layout(autosize=True,
                   title='Nationality Representation For Each Region')
fig = go.Figure(data=pies, layout=layout)
# NOTE(review): 'nacionality' is a typo but is kept so the output filename
# stays stable for anything linking to it.
plotly.offline.plot(fig, filename=viz_out+'pies_nacionality_per_region.html', show_link=True, auto_open=False)
plotly.offline.iplot(fig, show_link=False)
# TODO (stale dev note kept from original): apply clustering logic to value clustering.
# Per-country overall profile: the mean of the one-hot columns gives, for each
# nationality, the share of its reviews falling in each cluster/listing type.
origins_grouped_ovrl = tourists.drop(origins_index, axis=1).groupby(['final_country_parse']).mean()
origins_grouped_ovrl['review_count'] = tourists.groupby(['final_country_parse']).size()
origins_grouped_ovrl = origins_grouped_ovrl[[ 'review_count', 'Average Daily Rate (USD)', 'Bedrooms', 'cluster_0',
    'cluster_1', 'cluster_2', 'cluster_3', 'cluster_4', 'cluster_5',
    'cluster_6', 'cluster_7', 'cluster_8', 'cluster_9', 'cluster_100',
    'listing_type_Entire home/apt', 'listing_type_Private room',
    'listing_type_unknown', 'listing_type_Shared room' ]]
# Keep only nationalities with a meaningful sample size.
origins_grouped_ovrl = origins_grouped_ovrl[origins_grouped_ovrl['review_count'] > 1000]

def _top3_clusters(shares):
    """Return the indices of the three largest values in `shares`.

    Replicates the original strict-comparison scan exactly: ties are broken
    in favour of the lower index, and an all-zero input yields (0, 0, 0).
    """
    top_v = second_v = third_v = 0
    top_i = second_i = third_i = 0
    for i, v in enumerate(shares):
        if v > top_v:
            top_i, second_i, third_i = i, top_i, second_i
            top_v, second_v, third_v = v, top_v, second_v
        elif v > second_v:
            second_i, third_i = i, second_i
            second_v, third_v = v, second_v
        elif v > third_v:
            third_i, third_v = i, v
    return top_i, second_i, third_i

# FIX: the original wrote results with chained assignment
# (df['col'][row] = ...) and read cells with positional integer indexing on a
# string-labeled Series (series[row]) — both deprecated/removed in modern
# pandas. Compute the ranking per row, then assign whole columns at once.
cluster_share_cols = ['cluster_%d' % i for i in range(10)]
ranks = [_top3_clusters(row)
         for row in origins_grouped_ovrl[cluster_share_cols].itertuples(index=False)]
origins_grouped_ovrl['most_visited_cluster'] = [r[0] for r in ranks]
origins_grouped_ovrl['second_most_visited'] = [r[1] for r in ranks]
origins_grouped_ovrl['third_most_visited'] = [r[2] for r in ranks]
# Notebook display: the 30 best-represented nationalities with their ranking.
origins_grouped_ovrl.nlargest(30, 'review_count')
# Time series: average occupancy rate per geo cluster.
trace_list = []
for cluster in cluster_nums:
    subset = regions_grouped[regions_grouped['cluster'] == cluster]
    trace_list.append({
        "x": subset['Review Date'],
        "y": subset['Avg Occupancy Rate'],
        "mode": "lines",
        "name": cluster,
        "text": f"Cluster {cluster}",
    })
data = go.Data(trace_list)
layout = {
    "hovermode": "closest",
    "title": "Average Occupancy Rate per region",
    "xaxis": {
        "autorange": False,
        # Fixed window covering the period with reliable data.
        "range": ["2014-08-31 15:42:21.1765", "2017-12-04 18:52:56.4706"],
        "title": "Review Date",
        "type": "date",
    },
    "yaxis": {
        "autorange": True,
        "title": "Avg Occupancy Rate",
        "type": "linear",
    },
}
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename=viz_out+'occupancy_rate_per_region.html', show_link=True, auto_open=False)
plotly.offline.iplot(fig, show_link=False)
# Top-14 nationalities by overall review count, excluding the '-' placeholder
# used for unparsed origins.
origins_per_cluster = tourists[['Review Date', 'final_country_parse']]
# FIX: the original grouped `origins_per_cluster_1` before that name existed
# (NameError); the freshly built `origins_per_cluster` is the intended source.
origins_per_cluster_1 = origins_per_cluster.groupby(by=['final_country_parse']).size().to_frame(name='count').reset_index()
origins_per_cluster_1 = origins_per_cluster_1[origins_per_cluster_1['final_country_parse'] != '-'].nlargest(14, 'count')
countries = list(origins_per_cluster_1['final_country_parse'])
def top_countries_filter(country):
    """Collapse any nationality outside the global `countries` top list into 'others'."""
    return country if country in countries else 'others'
# Count reviews per (date, nationality), with minor nationalities collapsed
# into 'others' and the '-' placeholder removed.
origins_per_cluster_2 = origins_per_cluster[origins_per_cluster['final_country_parse'] != '-']
origins_per_cluster_2['final_country_parse'] = origins_per_cluster_2['final_country_parse'].apply(top_countries_filter)
origins_per_cluster_2 = (origins_per_cluster_2
                         .groupby(by=['Review Date', 'final_country_parse'])
                         .size()
                         .to_frame(name='count')
                         .reset_index())
# Notebook display of the resulting table.
origins_per_cluster_2
# Time series: review counts per nationality (top 14 plus 'others').
trace_list = []
countries.append('others')
for country in countries:
    subset = origins_per_cluster_2[origins_per_cluster_2['final_country_parse'] == country]
    trace_list.append({
        "x": subset['Review Date'],
        "y": subset['count'],
        "mode": "lines",
        "name": country,
        "text": country,
    })
data = go.Data(trace_list)
layout = {
    "hovermode": "closest",
    "title": "Count of booking reviews per Nationality",
    "xaxis": {
        "autorange": False,
        # Fixed window covering the period with reliable data.
        "range": ["2014-08-31 15:42:21.1765", "2017-12-04 18:52:56.4706"],
        "title": "Review Date",
        "type": "date",
    },
    "yaxis": {
        "autorange": True,
        "title": "Count",
        "type": "linear",
    },
}
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename=viz_out+'count_of_reviews_per_nationality.html', show_link=True, auto_open=False)
plotly.offline.iplot(fig, show_link=False)
# Sanity check: compare raw, filtered, and deduplicated review-row counts to
# spot duplicate records introduced (or missed) by the filtering steps.
for frame in (reviews,
              tables_filtered['reviews'],
              tables_filtered['reviews'].drop_duplicates(),
              tables['reviews'],
              tables['reviews'].drop_duplicates()):
    print(len(frame))
"""
origins_per_cluster = tourists[['cluster', 'final_country_parse']]
origins_per_cluster_1 = origins_per_cluster_1.groupby(by=['final_country_parse']).size().to_frame(name='count').reset_index()
origins_per_cluster_1 = origins_per_cluster_1[origins_per_cluster_1['final_country_parse'] != '-'].nlargest(14, 'count')
countries = list(origins_per_cluster_1['final_country_parse'])
def top_countries_filter(country):
if country not in countries:
country = 'others'
return country
origins_per_cluster_2 = origins_per_cluster
origins_per_cluster_2['final_country_parse'] = origins_per_cluster_2['final_country_parse'].apply(top_countries_filter)
origins_per_cluster_2 = origins_per_cluster_2.groupby(by=['cluster', 'final_country_parse']).size().to_frame(name='count')
origins_per_cluster_2 = origins_per_cluster_2.reset_index()
origins_per_cluster_2 = origins_per_cluster_2[origins_per_cluster_2['final_country_parse'] != '-']
origins_per_cluster_2 = origins_per_cluster_2[origins_per_cluster_2['cluster'] != 100]
origins_per_cluster_2.to_csv('origins_per_cluster_cluster.csv')
"""